
rm(list = ls())

library(tidyverse)
library(haven)
library(readxl)
library(lfe)

winsorize_x = function(x, cut = 0.001){
  cut_point_top <- quantile(x, 1 - cut, na.rm = T)
  cut_point_bottom <- quantile(x, cut, na.rm = T)
  i = which(x >= cut_point_top)
  x[i] = cut_point_top
  j = which(x <= cut_point_bottom)
  x[j] = cut_point_bottom
  return(x)
}


### Unemployment ###############################################################

spf_fcsts_unemp <- read_excel("01_InData/Individual_UNEMP.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, UNEMP1, UNEMP2, UNEMP3, UNEMP4, UNEMP5, UNEMP6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(UNEMP6_lag_1  = lag(UNEMP6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(UNEMP6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(UNEMP6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_unemp <- read_excel("01_InData/rucQvMd.xlsx") %>% 
  mutate(month = as.numeric(substr(DATE, 6, 7))) %>% 
  filter(month == 3 | month == 6 | month == 9 | month == 12) %>% 
  mutate(quarter = case_when(
    month ==  3 ~ 1,
    month ==  6 ~ 2,
    month ==  9 ~ 3,
    month == 12 ~ 4)) %>% 
  mutate(year = as.numeric(substr(DATE, 1, 4))) %>% 
  filter(DATE >= "1948:04") %>%
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("RUC")), as.character) %>%
  mutate(unemp_actual_init = coalesce(!!! select(., matches("RUC")))) %>%
  arrange(year, month) %>% 
  mutate(unemp_actual_init_lead_3 = lead(unemp_actual_init, 3L)) %>% 
  select(year, quarter, unemp_actual_init, unemp_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

unemp_data <- spf_fcsts_unemp %>% 
  inner_join(actuals_rt_unemp, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(UNEMP5       = winsorize_x(UNEMP5, cut = 0.001),
         UNEMP6_lag_1 = winsorize_x(UNEMP6_lag_1, cut = 0.001)) %>% 
  mutate(fc_3  = UNEMP5,
         fe_3  = unemp_actual_init_lead_3 - UNEMP5,
         rev_3 = UNEMP5 - UNEMP6_lag_1) %>% 
  mutate(var = "unemp") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile(UNEMP6_lag_1, 2L)) %>% 
  ungroup()

write_dta(unemp_data, "02_OutData/unemp_data.dta")

unemp_sumstats <- unemp_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "unemp")

n_unemp <- unemp_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Inflation (CPI) ############################################################

spf_fcsts_cpi <- read_excel("01_InData/Individual_RGDP.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, RGDP1, RGDP2, RGDP3, RGDP4, RGDP5, RGDP6) %>%
  mutate_if(is.character, as.numeric) %>%
  arrange(ID, YEAR, QUARTER) %>%
  mutate(RGDP2_lag_1    = lag(RGDP2, 1L),
         RGDP3_lag_1    = lag(RGDP3, 1L),
         RGDP4_lag_1    = lag(RGDP4, 1L),
         RGDP5_lag_1    = lag(RGDP5, 1L),
         RGDP6_lag_1    = lag(RGDP6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>%
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>%
  arrange(YEAR, QUARTER, ID) %>%
  group_by(YEAR, QUARTER) %>%
  mutate(median_prior_lag_1   = ntile(RGDP6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(RGDP6_lag_1, 5L)) %>%
  ungroup()

actuals_rt_cpi <- read_excel("01_InData/ROUTPUTQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("ROUTPUT")), as.character) %>%
  mutate(cpi_actual_init = coalesce(!!! select(., matches("ROUTPUT")))) %>%
  arrange(year, quarter) %>% 
  mutate(cpi_actual_init_lag_1  = lag(cpi_actual_init, 1L),
         cpi_actual_init_lead_3 = lead(cpi_actual_init, 3L)) %>% 
  select(year, quarter, cpi_actual_init, cpi_actual_init_lag_1, cpi_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

cpi_data <- spf_fcsts_cpi %>%
  inner_join(actuals_rt_cpi, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>%
  mutate(RGDP2       = winsorize_x(RGDP2, cut = 0.001),
         RGDP3       = winsorize_x(RGDP3, cut = 0.001),
         RGDP4       = winsorize_x(RGDP4, cut = 0.001),
         RGDP5       = winsorize_x(RGDP5, cut = 0.001),
         RGDP2_lag_1 = winsorize_x(RGDP2_lag_1, cut = 0.001),
         RGDP3_lag_1 = winsorize_x(RGDP3_lag_1, cut = 0.001),
         RGDP4_lag_1 = winsorize_x(RGDP4_lag_1, cut = 0.001),
         RGDP5_lag_1 = winsorize_x(RGDP5_lag_1, cut = 0.001),
         RGDP6_lag_1 = winsorize_x(RGDP6_lag_1, cut = 0.001)) %>%
  mutate(fc_3  = ((RGDP2 / RGDP1 - 1) + (RGDP3 / RGDP2 - 1) + (RGDP4 / RGDP3 - 1) + (RGDP5 / RGDP4 - 1)),
         fe_3  = (cpi_actual_init_lead_3 / cpi_actual_init_lag_1 - 1) - ((RGDP2 / RGDP1 - 1) + (RGDP3 / RGDP2 - 1) + (RGDP4 / RGDP3 - 1) + (RGDP5 / RGDP4 - 1)),
         rev_3 = ((RGDP2 / RGDP1 - 1) + (RGDP3 / RGDP2 - 1) + (RGDP4 / RGDP3 - 1) + (RGDP5 / RGDP4 - 1)) - ((RGDP3_lag_1 / RGDP2_lag_1 - 1) + (RGDP4_lag_1 / RGDP3_lag_1 - 1) + (RGDP5_lag_1 / RGDP4_lag_1 - 1) + (RGDP6_lag_1 / RGDP5_lag_1 - 1))) %>% 
  mutate(var = "cpi") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile(((RGDP3_lag_1 / RGDP2_lag_1 - 1) + (RGDP4_lag_1 / RGDP3_lag_1 - 1) + (RGDP5_lag_1 / RGDP4_lag_1 - 1) + (RGDP6_lag_1 / RGDP5_lag_1 - 1)), 2L)) %>% 
  ungroup()

write_dta(cpi_data, "02_OutData/cpi_data.dta")

cpi_sumstats <- cpi_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "cpi")

n_cpi <- cpi_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Nominal GDP ################################################################

spf_fcsts_ngdp <- read_excel("01_InData/Individual_NGDP.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, NGDP1, NGDP2, NGDP3, NGDP4, NGDP5, NGDP6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(NGDP2_lag_1   = lag(NGDP2, 1L),
         NGDP6_lag_1   = lag(NGDP6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(NGDP6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(NGDP6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_ngdp <- read_excel("01_InData/NOUTPUTQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("NOUTPUT")), as.character) %>%
  mutate(ngdp_actual_init = coalesce(!!! select(., matches("NOUTPUT")))) %>%
  arrange(year, quarter) %>% 
  mutate(ngdp_actual_init_lag_1  = lag(ngdp_actual_init, 1L),
         ngdp_actual_init_lead_3 = lead(ngdp_actual_init, 3L)) %>% 
  select(year, quarter, ngdp_actual_init, ngdp_actual_init_lag_1, ngdp_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

ngdp_data <- spf_fcsts_ngdp %>% 
  inner_join(actuals_rt_ngdp, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(NGDP5       = winsorize_x(NGDP5, cut = 0.001),
         NGDP6_lag_1 = winsorize_x(NGDP6_lag_1, cut = 0.001),
         NGDP2_lag_1 = winsorize_x(NGDP2_lag_1, cut = 0.001)) %>% 
  mutate(fc_3  = (NGDP5 / NGDP1 - 1),
         fe_3  = (ngdp_actual_init_lead_3 / ngdp_actual_init_lag_1 - 1) - (NGDP5 / NGDP1 - 1),
         rev_3 = (NGDP5 / NGDP1 - 1) - (NGDP6_lag_1 / NGDP2_lag_1 - 1)) %>% 
  mutate(var = "ngdp") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((NGDP6_lag_1 / NGDP2_lag_1 - 1), 2L)) %>% 
  ungroup()


write_dta(ngdp_data, "02_OutData/ngdp_data.dta")

ngdp_sumstats <- ngdp_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "ngdp")

n_ngdp <- ngdp_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Real GDP ###################################################################

spf_fcsts_rgdp <- read_excel("01_InData/Individual_RGDP.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, RGDP1, RGDP2, RGDP3, RGDP4, RGDP5, RGDP6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(RGDP2_lag_1   = lag(RGDP2, 1L),
         RGDP6_lag_1   = lag(RGDP6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(RGDP6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(RGDP6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_rgdp <- read_excel("01_InData/ROUTPUTQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("ROUTPUT")), as.character) %>%
  mutate(rgdp_actual_init = coalesce(!!! select(., matches("ROUTPUT")))) %>%
  arrange(year, quarter) %>% 
  mutate(rgdp_actual_init_lag_1  = lag(rgdp_actual_init, 1L),
         rgdp_actual_init_lead_3 = lead(rgdp_actual_init, 3L)) %>% 
  select(year, quarter, rgdp_actual_init, rgdp_actual_init_lag_1, rgdp_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

rgdp_data <- spf_fcsts_rgdp %>% 
  inner_join(actuals_rt_rgdp, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(RGDP5       = winsorize_x(RGDP5, cut = 0.001),
         RGDP6_lag_1 = winsorize_x(RGDP6_lag_1, cut = 0.001),
         RGDP2_lag_1 = winsorize_x(RGDP2_lag_1, cut = 0.001)) %>% 
  mutate(fc_3  = (RGDP5 / RGDP1 - 1),
         fe_3  = (rgdp_actual_init_lead_3 / rgdp_actual_init_lag_1 - 1) - (RGDP5 / RGDP1 - 1),
         rev_3 = (RGDP5 / RGDP1 - 1) - (RGDP6_lag_1 / RGDP2_lag_1 - 1)) %>% 
  mutate(var = "rgdp") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((RGDP6_lag_1 / RGDP2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(rgdp_data, "02_OutData/rgdp_data.dta")

rgdp_sumstats <- rgdp_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "rgdp")

n_rgdp <- rgdp_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Price Index for GDP ########################################################

spf_fcsts_pgdp <- read_excel("01_InData/Individual_PGDP.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, PGDP1, PGDP2, PGDP3, PGDP4, PGDP5, PGDP6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(PGDP2_lag_1   = lag(PGDP2, 1L),
         PGDP6_lag_1   = lag(PGDP6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(PGDP6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(PGDP6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_pgdp <- read_excel("01_InData/PQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("P")), as.character) %>%
  mutate(pgdp_actual_init = coalesce(!!! select(., matches("P")))) %>%
  arrange(year, quarter) %>% 
  mutate(pgdp_actual_init_lag_1  = lag(pgdp_actual_init, 1L),
         pgdp_actual_init_lead_3 = lead(pgdp_actual_init, 3L)) %>% 
  select(year, quarter, pgdp_actual_init, pgdp_actual_init_lag_1, pgdp_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

pgdp_data <- spf_fcsts_pgdp %>% 
  inner_join(actuals_rt_pgdp, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(PGDP5       = winsorize_x(PGDP5, cut = 0.001),
         PGDP6_lag_1 = winsorize_x(PGDP6_lag_1, cut = 0.001),
         PGDP2_lag_1 = winsorize_x(PGDP2_lag_1, cut = 0.001)) %>% 
  mutate(fc_3  = (PGDP5 / PGDP1 - 1),
         fe_3  = (pgdp_actual_init_lead_3 / pgdp_actual_init_lag_1 - 1) - (PGDP5 / PGDP1 - 1),
         rev_3 = (PGDP5 / PGDP1 - 1) - (PGDP6_lag_1 / PGDP2_lag_1 - 1)) %>% 
  mutate(var = "pgdp") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((PGDP6_lag_1 / PGDP2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(pgdp_data, "02_OutData/pgdp_data.dta")

pgdp_sumstats <- pgdp_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "pgdp")

n_pgdp <- pgdp_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Industrial Production Index ################################################

spf_fcsts_indprod <- read_excel("01_InData/Individual_INDPROD.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, INDPROD1, INDPROD2, INDPROD3, INDPROD4, INDPROD5, INDPROD6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(INDPROD2_lag_1   = lag(INDPROD2, 1L),
         INDPROD6_lag_1   = lag(INDPROD6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(INDPROD6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(INDPROD6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_indprod <- read_excel("01_InData/iptMvMd.xlsx") %>% 
  mutate(month = as.numeric(substr(DATE, 6, 7))) %>% 
  filter(month == 3 | month == 6 | month == 9 | month == 12) %>% 
  mutate(quarter = case_when(
    month ==  3 ~ 1,
    month ==  6 ~ 2,
    month ==  9 ~ 3,
    month == 12 ~ 4)) %>% 
  mutate(year = as.numeric(substr(DATE, 1, 4))) %>% 
  filter(DATE >= "1947:01") %>%
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("IPT")), as.character) %>%
  mutate(indprod_actual_init = coalesce(!!! select(., matches("IPT")))) %>%
  arrange(year, month) %>% 
  mutate(indprod_actual_init_lag_1  = lag(indprod_actual_init, 1L),
         indprod_actual_init_lead_3 = lead(indprod_actual_init, 3L)) %>% 
  select(year, quarter, indprod_actual_init, indprod_actual_init_lag_1, indprod_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

indprod_data <- spf_fcsts_indprod %>% 
  inner_join(actuals_rt_indprod, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(INDPROD5       = winsorize_x(INDPROD5, cut = 0.001),
         INDPROD6_lag_1 = winsorize_x(INDPROD6_lag_1, cut = 0.001),
         INDPROD2_lag_1 = winsorize_x(INDPROD2_lag_1, cut = 0.001)) %>% 
  mutate(fc_3  = (INDPROD5 / INDPROD1 - 1),
         fe_3  = (indprod_actual_init_lead_3 / indprod_actual_init_lag_1 - 1) - (INDPROD5 / INDPROD1 - 1),
         rev_3 = (INDPROD5 / INDPROD1 - 1) - (INDPROD6_lag_1 / INDPROD2_lag_1 - 1)) %>% 
  mutate(var = "indprod") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((INDPROD6_lag_1 / INDPROD2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(indprod_data, "02_OutData/indprod_data.dta")

indprod_sumstats <- indprod_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "indprod")

n_indprod <- indprod_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Real Personal Consumption Expenditures #####################################

spf_fcsts_rconsum <- read_excel("01_InData/Individual_RCONSUM.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, RCONSUM1, RCONSUM2, RCONSUM3, RCONSUM4, RCONSUM5, RCONSUM6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(RCONSUM2_lag_1   = lag(RCONSUM2, 1L),
         RCONSUM6_lag_1   = lag(RCONSUM6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(RCONSUM6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(RCONSUM6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_rconsum <- read_excel("01_InData/RCONQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("RCON")), as.character) %>%
  mutate(rconsum_actual_init = coalesce(!!! select(., matches("RCON")))) %>%
  arrange(year, quarter) %>% 
  mutate(rconsum_actual_init_lag_1  = lag(rconsum_actual_init, 1L),
         rconsum_actual_init_lead_3 = lead(rconsum_actual_init, 3L)) %>% 
  select(year, quarter, rconsum_actual_init, rconsum_actual_init_lag_1, rconsum_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

rconsum_data <- spf_fcsts_rconsum %>% 
  inner_join(actuals_rt_rconsum, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(RCONSUM5       = winsorize_x(RCONSUM5, cut = 0.001),
         RCONSUM6_lag_1 = winsorize_x(RCONSUM6_lag_1, cut = 0.001),
         RCONSUM2_lag_1 = winsorize_x(RCONSUM2_lag_1, cut = 0.001)) %>%
  mutate(fc_3  = (RCONSUM5 / RCONSUM1 - 1),
         fe_3  = (rconsum_actual_init_lead_3 / rconsum_actual_init_lag_1 - 1) - (RCONSUM5 / RCONSUM1 - 1),
         rev_3 = (RCONSUM5 / RCONSUM1 - 1) - (RCONSUM6_lag_1 / RCONSUM2_lag_1 - 1)) %>% 
  mutate(var = "rconsum") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((RCONSUM6_lag_1 / RCONSUM2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(rconsum_data, "02_OutData/rconsum_data.dta")

rconsum_sumstats <- rconsum_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "rconsum")

n_rconsum <- rconsum_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Real Nonresidential Fixed Investment #######################################

spf_fcsts_rnresin <- read_excel("01_InData/Individual_RNRESIN.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, RNRESIN1, RNRESIN2, RNRESIN3, RNRESIN4, RNRESIN5, RNRESIN6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(RNRESIN2_lag_1   = lag(RNRESIN2, 1L),
         RNRESIN6_lag_1   = lag(RNRESIN6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(RNRESIN6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(RNRESIN6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_rnresin <- read_excel("01_InData/rinvbfQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("rinvbf")), as.character) %>%
  mutate(rnresin_actual_init = coalesce(!!! select(., matches("rinvbf")))) %>%
  arrange(year, quarter) %>% 
  mutate(rnresin_actual_init_lag_1  = lag(rnresin_actual_init, 1L),
         rnresin_actual_init_lead_3 = lead(rnresin_actual_init, 3L)) %>% 
  select(year, quarter, rnresin_actual_init, rnresin_actual_init_lag_1, rnresin_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

rnresin_data <- spf_fcsts_rnresin %>% 
  inner_join(actuals_rt_rnresin, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(RNRESIN5       = winsorize_x(RNRESIN5, cut = 0.001),
         RNRESIN6_lag_1 = winsorize_x(RNRESIN6_lag_1, cut = 0.001),
         RNRESIN2_lag_1 = winsorize_x(RNRESIN2_lag_1, cut = 0.001)) %>%
  mutate(fc_3  = (RNRESIN5 / RNRESIN1 - 1),
         fe_3  = (rnresin_actual_init_lead_3 / rnresin_actual_init_lag_1 - 1) - (RNRESIN5 / RNRESIN1 - 1),
         rev_3 = (RNRESIN5 / RNRESIN1 - 1) - (RNRESIN6_lag_1 / RNRESIN2_lag_1 - 1)) %>% 
  mutate(var = "rnresin") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((RNRESIN6_lag_1 / RNRESIN2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(rnresin_data, "02_OutData/rnresin_data.dta")

rnresin_sumstats <- rnresin_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "rnresin")

n_rnresin <- rnresin_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Real Residential Fixed Investment ##########################################

spf_fcsts_rresinv <- read_excel("01_InData/Individual_RRESINV.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, RRESINV1, RRESINV2, RRESINV3, RRESINV4, RRESINV5, RRESINV6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(RRESINV2_lag_1   = lag(RRESINV2, 1L),
         RRESINV6_lag_1   = lag(RRESINV6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(RRESINV6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(RRESINV6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_rresinv <- read_excel("01_InData/rinvresidQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("rinvresid")), as.character) %>%
  mutate(rresinv_actual_init = coalesce(!!! select(., matches("rinvresid")))) %>%
  arrange(year, quarter) %>% 
  mutate(rresinv_actual_init_lag_1  = lag(rresinv_actual_init, 1L),
         rresinv_actual_init_lead_3 = lead(rresinv_actual_init, 3L)) %>% 
  select(year, quarter, rresinv_actual_init, rresinv_actual_init_lag_1, rresinv_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

rresinv_data <- spf_fcsts_rresinv %>% 
  inner_join(actuals_rt_rresinv, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(RRESINV5       = winsorize_x(RRESINV5, cut = 0.001),
         RRESINV6_lag_1 = winsorize_x(RRESINV6_lag_1, cut = 0.001),
         RRESINV2_lag_1 = winsorize_x(RRESINV2_lag_1, cut = 0.001)) %>%
  mutate(fc_3  = (RRESINV5 / RRESINV1 - 1),
         fe_3  = (rresinv_actual_init_lead_3 / rresinv_actual_init_lag_1 - 1) - (RRESINV5 / RRESINV1 - 1),
         rev_3 = (RRESINV5 / RRESINV1 - 1) - (RRESINV6_lag_1 / RRESINV2_lag_1 - 1)) %>% 
  mutate(var = "rresinv") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((RRESINV6_lag_1 / RRESINV2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(rresinv_data, "02_OutData/rresinv_data.dta")

rresinv_sumstats <- rresinv_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "rresinv")

n_rresinv <- rresinv_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Real Federal Government Consumption Expenditures ###########################

spf_fcsts_rfedgov <- read_excel("01_InData/Individual_RFEDGOV.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, RFEDGOV1, RFEDGOV2, RFEDGOV3, RFEDGOV4, RFEDGOV5, RFEDGOV6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(RFEDGOV2_lag_1   = lag(RFEDGOV2, 1L),
         RFEDGOV6_lag_1   = lag(RFEDGOV6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(RFEDGOV6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(RFEDGOV6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_rfedgov <- read_excel("01_InData/RGFQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("RGF")), as.character) %>%
  mutate(rfedgov_actual_init = coalesce(!!! select(., matches("RGF")))) %>%
  arrange(year, quarter) %>% 
  mutate(rfedgov_actual_init_lag_1  = lag(rfedgov_actual_init, 1L),
         rfedgov_actual_init_lead_3 = lead(rfedgov_actual_init, 3L)) %>% 
  select(year, quarter, rfedgov_actual_init, rfedgov_actual_init_lag_1, rfedgov_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

rfedgov_data <- spf_fcsts_rfedgov %>% 
  inner_join(actuals_rt_rfedgov, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(RFEDGOV5       = winsorize_x(RFEDGOV5, cut = 0.001),
         RFEDGOV6_lag_1 = winsorize_x(RFEDGOV6_lag_1, cut = 0.001),
         RFEDGOV2_lag_1 = winsorize_x(RFEDGOV2_lag_1, cut = 0.001)) %>%
  mutate(fc_3  = (RFEDGOV5 / RFEDGOV1 - 1),
         fe_3  = (rfedgov_actual_init_lead_3 / rfedgov_actual_init_lag_1 - 1) - (RFEDGOV5 / RFEDGOV1 - 1),
         rev_3 = (RFEDGOV5 / RFEDGOV1 - 1) - (RFEDGOV6_lag_1 / RFEDGOV2_lag_1 - 1)) %>% 
  mutate(var = "rfedgov") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((RFEDGOV6_lag_1 / RFEDGOV2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(rfedgov_data, "02_OutData/rfedgov_data.dta")

rfedgov_sumstats <- rfedgov_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "rfedgov")

n_rfedgov <- rfedgov_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Real State and Local Government Consumption Expenditures ###################

spf_fcsts_rslgov <- read_excel("01_InData/Individual_RSLGOV.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, RSLGOV1, RSLGOV2, RSLGOV3, RSLGOV4, RSLGOV5, RSLGOV6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(RSLGOV2_lag_1   = lag(RSLGOV2, 1L),
         RSLGOV6_lag_1   = lag(RSLGOV6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(RSLGOV6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(RSLGOV6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_rslgov <- read_excel("01_InData/RGSLQvQd.xlsx") %>% 
  mutate(year    = as.numeric(substr(DATE, 1, 4)),
         quarter = as.numeric(substr(DATE, 7, 7))) %>% 
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("RGSL")), as.character) %>%
  mutate(rslgov_actual_init = coalesce(!!! select(., matches("RGSL")))) %>%
  arrange(year, quarter) %>% 
  mutate(rslgov_actual_init_lag_1  = lag(rslgov_actual_init, 1L),
         rslgov_actual_init_lead_3 = lead(rslgov_actual_init, 3L)) %>% 
  select(year, quarter, rslgov_actual_init, rslgov_actual_init_lag_1, rslgov_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

rslgov_data <- spf_fcsts_rslgov %>% 
  inner_join(actuals_rt_rslgov, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(RSLGOV5       = winsorize_x(RSLGOV5, cut = 0.001),
         RSLGOV6_lag_1 = winsorize_x(RSLGOV6_lag_1, cut = 0.001),
         RSLGOV2_lag_1 = winsorize_x(RSLGOV2_lag_1, cut = 0.001)) %>%
  mutate(fc_3  = (RSLGOV5 / RSLGOV1 - 1),
         fe_3  = (rslgov_actual_init_lead_3 / rslgov_actual_init_lag_1 - 1) - (RSLGOV5 / RSLGOV1 - 1),
         rev_3 = (RSLGOV5 / RSLGOV1 - 1) - (RSLGOV6_lag_1 / RSLGOV2_lag_1 - 1)) %>% 
  mutate(var = "rslgov") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((RSLGOV6_lag_1 / RSLGOV2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(rslgov_data, "02_OutData/rslgov_data.dta")

rslgov_sumstats <- rslgov_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "rslgov")

n_rslgov <- rslgov_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Housing Starts #############################################################

spf_fcsts_housing <- read_excel("01_InData/Individual_HOUSING.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, HOUSING1, HOUSING2, HOUSING3, HOUSING4, HOUSING5, HOUSING6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(HOUSING2_lag_1   = lag(HOUSING2, 1L),
         HOUSING6_lag_1   = lag(HOUSING6, 1L),
         YEAR_lag_1    = lag(YEAR, 1L),
         QUARTER_lag_1 = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(HOUSING6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(HOUSING6_lag_1, 5L)) %>% 
  ungroup()

actuals_rt_housing <- read_excel("01_InData/hstartsMvMd.xlsx") %>% 
  mutate(month = as.numeric(substr(DATE, 6, 7))) %>% 
  filter(month == 3 | month == 6 | month == 9 | month == 12) %>% 
  mutate(quarter = case_when(
    month ==  3 ~ 1,
    month ==  6 ~ 2,
    month ==  9 ~ 3,
    month == 12 ~ 4)) %>% 
  mutate(year = as.numeric(substr(DATE, 1, 4))) %>% 
  filter(DATE >= "1967:01") %>%
  select(-DATE) %>%
  mutate_if(is.character, as.numeric) %>%
  mutate_at(vars(matches("HSTARTS")), as.character) %>%
  mutate(housing_actual_init = coalesce(!!! select(., matches("HSTARTS")))) %>%
  arrange(year, month) %>% 
  mutate(housing_actual_init_lag_1  = lag(housing_actual_init, 1L),
         housing_actual_init_lead_3 = lead(housing_actual_init, 3L)) %>% 
  select(year, quarter, housing_actual_init, housing_actual_init_lag_1, housing_actual_init_lead_3) %>% 
  mutate_if(is.character, as.numeric)

housing_data <- spf_fcsts_housing %>% 
  inner_join(actuals_rt_housing, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(HOUSING5       = winsorize_x(HOUSING5, cut = 0.001),
         HOUSING6_lag_1 = winsorize_x(HOUSING6_lag_1, cut = 0.001),
         HOUSING2_lag_1 = winsorize_x(HOUSING2_lag_1, cut = 0.001)) %>%
  mutate(fc_3  = (HOUSING5 / HOUSING1 - 1),
         fe_3  = (housing_actual_init_lead_3 / housing_actual_init_lag_1 - 1) - (HOUSING5 / HOUSING1 - 1),
         rev_3 = (HOUSING5 / HOUSING1 - 1) - (HOUSING6_lag_1 / HOUSING2_lag_1 - 1)) %>% 
  mutate(var = "housing") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile((HOUSING6_lag_1 / HOUSING2_lag_1 - 1), 2L)) %>% 
  ungroup()

write_dta(housing_data, "02_OutData/housing_data.dta")

housing_sumstats <- housing_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "housing")

n_housing <- housing_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### 3-month treasury bill rate #################################################

spf_fcsts_tbill <- read_excel("01_InData/Individual_TBILL.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, TBILL1, TBILL2, TBILL3, TBILL4, TBILL5, TBILL6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(TBILL6_lag_1   = lag(TBILL6, 1L),
         YEAR_lag_1     = lag(YEAR, 1L),
         QUARTER_lag_1  = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(TBILL6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(TBILL6_lag_1, 5L)) %>% 
  ungroup()

actuals_tbill <- read_excel("01_InData/Individual_TBILL.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, TBILL1, TBILL2, TBILL3, TBILL4, TBILL5, TBILL6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(TBILL1_lead_4  = lead(TBILL1, 4L),
         YEAR_lead_4    = lead(YEAR, 4L),
         QUARTER_lead_4 = lead(QUARTER, 4L)) %>% 
  filter(YEAR_lead_4 == YEAR + 1 & QUARTER_lead_4 == QUARTER) %>% 
  select(YEAR, QUARTER, ID, TBILL1_lead_4)

tbill_data <- spf_fcsts_tbill %>% 
  inner_join(actuals_tbill, by = c("YEAR", "QUARTER", "ID")) %>% 
  mutate(TBILL5 = winsorize_x(TBILL5, cut = 0.001)) %>%
  mutate(fc_3   = TBILL5,
         fe_3   = TBILL1_lead_4 - TBILL5,
         rev_3  = TBILL5 - TBILL6_lag_1) %>% 
  mutate(var = "tbill") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile(TBILL6_lag_1, 2L)) %>% 
  ungroup()

write_dta(tbill_data, "02_OutData/tbill_data.dta")

tbill_sumstats <- tbill_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>%   mutate(var = "tbill")

n_tbill <- tbill_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### 10-year treasury bond rate #################################################

spf_fcsts_tbond <- read_excel("01_InData/Individual_TBOND.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, TBOND1, TBOND2, TBOND3, TBOND4, TBOND5, TBOND6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(TBOND6_lag_1   = lag(TBOND6, 1L),
         YEAR_lag_1     = lag(YEAR, 1L),
         QUARTER_lag_1  = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(TBOND6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(TBOND6_lag_1, 5L)) %>% 
  ungroup()

actuals_tbond <- read_excel("01_InData/Individual_TBOND.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, TBOND1, TBOND2, TBOND3, TBOND4, TBOND5, TBOND6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(TBOND1_lead_4  = lead(TBOND1, 4L),
         YEAR_lead_4    = lead(YEAR, 4L),
         QUARTER_lead_4 = lead(QUARTER, 4L)) %>% 
  filter(YEAR_lead_4 == YEAR + 1 & QUARTER_lead_4 == QUARTER) %>% 
  select(YEAR, QUARTER, ID, TBOND1_lead_4)

tbond_data <- spf_fcsts_tbond %>% 
  inner_join(actuals_tbond, by = c("YEAR", "QUARTER", "ID")) %>% 
  mutate(TBOND5 = winsorize_x(TBOND5, cut = 0.001)) %>%
  mutate(fc_3   = TBOND5,
         fe_3   = TBOND1_lead_4 - TBOND5,
         rev_3  = TBOND5 - TBOND6_lag_1) %>% 
  mutate(var = "tbond") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile(TBOND6_lag_1, 2L)) %>% 
  ungroup()

write_dta(tbond_data, "02_OutData/tbond_data.dta")

tbond_sumstats <- tbond_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "tbond")

n_tbond <- tbond_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())


### Moody's Aaa Corporate Bond Yield ###########################################

spf_fcsts_bond <- read_excel("01_InData/Individual_BOND.xlsx") %>% 
  select(YEAR, QUARTER, ID, INDUSTRY, BOND1, BOND2, BOND3, BOND4, BOND5, BOND6) %>% 
  mutate_if(is.character, as.numeric) %>% 
  arrange(ID, YEAR, QUARTER) %>% 
  mutate(BOND6_lag_1   = lag(BOND6, 1L),
         YEAR_lag_1     = lag(YEAR, 1L),
         QUARTER_lag_1  = lag(QUARTER, 1L)) %>% 
  filter((YEAR_lag_1 == YEAR & QUARTER_lag_1 == QUARTER - 1) |  (YEAR_lag_1 == YEAR - 1 & QUARTER_lag_1 == 4)) %>% 
  arrange(YEAR, QUARTER, ID) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(median_prior_lag_1   = ntile(BOND6_lag_1, 2L),
         quintile_prior_lag_1 = ntile(BOND6_lag_1, 5L)) %>% 
  ungroup()

actuals_bond <- read_excel("01_InData/fred_AAA.xls") %>% 
  janitor::row_to_names(row_number = 10) %>% 
  mutate_if(is.character, as.numeric) %>% 
  mutate(date = as.Date(observation_date, origin="1900-01-01")) %>% 
  mutate(year  = lubridate::year(date),
         month = lubridate::month(date)) %>% 
  filter(month == 1 | month == 4 | month == 7 | month == 10) %>% 
  mutate(quarter = case_when(
    month ==  1 ~ 4,
    month ==  4 ~ 1,
    month ==  7 ~ 2,
    month == 10 ~ 3)) %>% 
  mutate(year = case_when(
    quarter == 4 ~ year - 1,
    TRUE         ~ year)) %>% 
  select(year, quarter, AAA) %>% 
  arrange(year, quarter) %>% 
  mutate(AAA_lead_3 = lead(AAA, 3L)) %>% 
  select(year, quarter, AAA_lead_3)

bond_data <- spf_fcsts_bond %>% 
  inner_join(actuals_bond, by = c("YEAR" = "year", "QUARTER" = "quarter")) %>% 
  mutate(BOND5 = winsorize_x(BOND5, cut = 0.001)) %>%
  mutate(fc_3   = BOND5,
         fe_3   = AAA_lead_3 - BOND5,
         rev_3  = BOND5 - BOND6_lag_1) %>% 
  mutate(var = "bond") %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(growth_prior = ntile(BOND6_lag_1, 2L)) %>% 
  ungroup()

write_dta(bond_data, "02_OutData/bond_data.dta")

bond_sumstats <- bond_data %>% 
  group_by(YEAR, QUARTER) %>% 
  summarize(me_fe   = mean(fe_3, na.rm = TRUE),
            med_fe  = median(fe_3, na.rm = TRUE),
            sd_fe   = sd(fe_3, na.rm = TRUE),
            me_rev  = mean(rev_3, na.rm = TRUE),
            med_rev = median(rev_3, na.rm = TRUE),
            sd_rev  = sd(rev_3, na.rm = TRUE)) %>% 
  ungroup() %>% 
  summarize(mean_me_fe   = mean(me_fe, na.rm = TRUE),
            mean_med_fe  = mean(med_fe, na.rm = TRUE),
            mean_sd_fe   = mean(sd_fe, na.rm = TRUE),
            mean_me_rev  = mean(me_rev, na.rm = TRUE),
            mean_med_rev = mean(med_rev, na.rm = TRUE),
            mean_sd_rev  = mean(sd_rev, na.rm = TRUE)) %>% 
  mutate(var = "bond")

n_bond <- bond_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3),
         !is.na(median_prior_lag_1)) %>% 
  summarize(n = n())



################################################################################
### summary statistics #########################################################
################################################################################

sumstats <- rbind(cpi_sumstats, housing_sumstats, ngdp_sumstats, pgdp_sumstats,
                  indprod_sumstats,
                  rconsum_sumstats, rfedgov_sumstats, rgdp_sumstats, rnresin_sumstats,
                  rresinv_sumstats, rslgov_sumstats, unemp_sumstats, tbill_sumstats,
                  tbond_sumstats, bond_sumstats) %>% 
  mutate_if(is.numeric, round, digits = 3)

write.csv(sumstats, "02_OutData/sumstats.csv")





################################################################################
### Median prior ###############################################################
################################################################################

### forecaster-by-forecaster regressions #######################################

var_data <- rbind(cpi_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  housing_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  indprod_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  ngdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  pgdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rconsum_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rfedgov_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rgdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rnresin_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rresinv_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rslgov_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  unemp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  tbill_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  tbond_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  bond_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER)) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(time = cur_group_id()) %>% 
  ungroup() %>% 
  select(-YEAR, -QUARTER)

write_dta(var_data, "02_OutData/macro_var_data.dta")

var_list <- list("housing", "indprod", "ngdp", "pgdp", "rconsum", "rfedgov",
                 "rgdp", "rnresin", "rresinv", "rslgov", "unemp", "cpi", "tbill",
                 "tbond", "bond")

PC_coeff_list  <- list()
PIC_coeff_list <- list()

for (j in var_list){
  
  # prior-consistent
  PC_IDs <- as.list(t(var_data %>% 
    filter((median_prior_lag_1 == 2 & rev_3 > 0) | (median_prior_lag_1 == 1 & rev_3 < 0)) %>%
    filter(!is.na(rev_3),
           !is.na(fe_3)) %>% 
    filter(var == j) %>% 
    group_by(ID) %>% 
    summarize(n = n()) %>% 
    ungroup() %>% 
    filter(n >= 10) %>% 
    select(-n)))
  
  PC_coeffs <- list()
  
  for (i in PC_IDs) {
    
    reg_PC_data <- var_data %>% 
      filter((median_prior_lag_1 == 2 & rev_3 > 0) | (median_prior_lag_1 == 1 & rev_3 < 0)) %>%
      filter(!is.na(rev_3),
             !is.na(fe_3)) %>% 
      filter(ID == i) %>% 
      filter(var == j)
    
    lm_reg_PC <- lm(fe_3 ~ rev_3, data = reg_PC_data)
    
    coeff_PC <- tibble(coeff = lm_reg_PC$coefficients[2]) %>% 
      mutate(ID = i)
    
    PC_coeffs[[i]] <- coeff_PC
    
  }
  
  PC_coeff_out <- bind_rows(PC_coeffs) %>% 
    summarize(median_coeff = median(coeff)) %>% 
    mutate(var = j) %>% 
    mutate(type = "PC")
  
  PC_coeff_list[[j]] <- PC_coeff_out
  
  # prior-inconsistent
  PIC_IDs <- as.list(t(var_data %>% 
    filter((median_prior_lag_1 == 2 & rev_3 < 0) | (median_prior_lag_1 == 1 & rev_3 > 0)) %>%
    filter(!is.na(rev_3),
           !is.na(fe_3)) %>% 
    filter(var == j) %>% 
    group_by(ID) %>% 
    summarize(n = n()) %>% 
    ungroup() %>% 
    filter(n >= 10) %>% 
    select(-n)))
  
  PIC_coeffs <- list()
  
  for (i in PIC_IDs) {
    
    reg_PIC_data <- var_data %>% 
      filter((median_prior_lag_1 == 2 & rev_3 < 0) | (median_prior_lag_1 == 1 & rev_3 > 0)) %>%
      filter(!is.na(rev_3),
             !is.na(fe_3)) %>% 
      filter(ID == i) %>% 
      filter(var == j)
    
    lm_reg_PIC <- lm(fe_3 ~ rev_3, data = reg_PIC_data)
    
    coeff_PIC <- tibble(coeff = lm_reg_PIC$coefficients[2]) %>% 
      mutate(ID  = i)
    
    PIC_coeffs[[i]] <- coeff_PIC
    
  }
  
  PIC_coeff_out <- bind_rows(PIC_coeffs) %>% 
    summarize(median_coeff = median(coeff)) %>% 
    mutate(var = j) %>% 
    mutate(type = "PIC")
  
  PIC_coeff_list[[j]] <- PIC_coeff_out
  
}

PC_coeff_list_out  <- bind_rows(PC_coeff_list)
PIC_coeff_list_out <- bind_rows(PIC_coeff_list)

coeff_list_out <- bind_rows(PC_coeff_list_out, PIC_coeff_list_out)

ind_plot <- coeff_list_out %>% 
  mutate(var = fct_relevel(var, "ngdp", "rgdp", "pgdp", "cpi", "rconsum", "indprod",
                           "rnresin", "rresinv", "rfedgov", "rslgov", "housing",
                           "unemp", "tbill", "tbond", "bond")) %>% 
  ggplot(aes(x = var, y = median_coeff)) + 
  geom_point(aes(shape = type)) +  ylim(-2, 1.5) +
  labs(x = "Forecasted Variables", y = "Median Coefficent Estimate", color = "Signal", shape = "Signal")
ind_plot

ggsave(height = 4, width = 8.5, "02_OutData/Figure1B.eps")

write.csv(coeff_list_out, "02_OutData/indiv_regs_out.csv")



### pooled regressions (interacted) ############################################

var_list <- list("housing", "indprod", "ngdp", "pgdp", "rconsum", "rfedgov",
                 "rgdp", "rnresin", "rresinv", "rslgov", "unemp", "cpi", "tbill",
                 "tbond", "bond")

coeff_list  <- list()

for (j in var_list){
  
  reg_data <- var_data %>% 
    mutate(pc = case_when(
      (median_prior_lag_1 == 2 & rev_3 > 0) | (median_prior_lag_1 == 1 & rev_3 < 0) ~ 1,
      TRUE                                                                          ~ 0),
           pc_rev_3 = pc * rev_3) %>% 
        filter(!is.na(rev_3),
           !is.na(fe_3)) %>%
    filter(var == j)
  
  lm_reg <- as.data.frame(t(broom::tidy(felm(fe_3 ~ rev_3 + pc + pc_rev_3 | 0 | 0 | ID + time, data = reg_data)) %>% 
    select(-statistic) %>% 
    mutate_if(is.numeric, round, digits = 10) %>% 
    mutate(estimate = case_when(
      p.value < 0.01 ~ paste0(round(estimate, 3), "***"),
      p.value < 0.05 ~ paste0(round(estimate, 3), "**"),
      p.value < 0.1  ~ paste0(round(estimate, 3), "*"),
      TRUE           ~ paste0(round(estimate, 3), ""))))) %>%
    janitor::row_to_names(row_number = 1) %>% 
    mutate(n = row_number()) %>% 
    filter(n == 1) %>% 
    select(-n) %>% 
    mutate(var = j)
  
  coeff_list[[j]] <- lm_reg
  
}

coeff_list_out  <- bind_rows(coeff_list)

write.csv(coeff_list_out, "02_OutData/pooled_regs_out.csv")


### pooled regressions - for graph #############################################

var_list <- list("housing", "indprod", "ngdp", "pgdp", "rconsum", "rfedgov",
                 "rgdp", "rnresin", "rresinv", "rslgov", "unemp", "cpi", "tbill",
                 "tbond", "bond")

PC_coeff_list  <- list()
PIC_coeff_list <- list()

for (j in var_list){
  
  # prior-consistent
  reg_PC_data <- var_data %>% 
    filter((median_prior_lag_1 == 2 & rev_3 > 0) | (median_prior_lag_1 == 1 & rev_3 < 0)) %>%
    filter(!is.na(rev_3),
           !is.na(fe_3)) %>%
    filter(var == j)
  
  lm_reg_PC <- broom::tidy(felm(fe_3 ~ rev_3 | 0 | 0 | ID + time, data = reg_PC_data)) %>% 
    filter(term == "rev_3")
  
  coeff_PC <- tibble(coeff = lm_reg_PC$estimate,
                     se    = lm_reg_PC$std.error,
                     pval  = lm_reg_PC$p.value) %>% 
    mutate(var = j) %>% 
    mutate(type = "PC")
  
  PC_coeff_list[[j]] <- coeff_PC
  
  # prior-inconsistent
  reg_PIC_data <- var_data %>% 
    filter((median_prior_lag_1 == 2 & rev_3 < 0) | (median_prior_lag_1 == 1 & rev_3 > 0)) %>%
    filter(!is.na(rev_3),
           !is.na(fe_3)) %>%
    filter(var == j)
  
  lm_reg_PIC <- broom::tidy(felm(fe_3 ~ rev_3 | 0 | 0 | ID + time, data = reg_PIC_data)) %>% 
    filter(term == "rev_3")
  
  coeff_PIC <- tibble(coeff = lm_reg_PIC$estimate,
                      se    = lm_reg_PIC$std.error,
                      pval  = lm_reg_PIC$p.value) %>% 
    mutate(var = j) %>% 
    mutate(type = "PIC")
  
  PIC_coeff_list[[j]] <- coeff_PIC
  
}

PC_coeff_list_out  <- bind_rows(PC_coeff_list)
PIC_coeff_list_out <- bind_rows(PIC_coeff_list)

coeff_list_out <- bind_rows(PC_coeff_list_out, PIC_coeff_list_out) %>% 
  mutate(pos_conf_inv = coeff + 1.96 * se,
         neg_conf_inv = coeff - 1.96 * se)

ind_plot <- coeff_list_out %>% 
  mutate(var = fct_relevel(var, "ngdp", "rgdp", "pgdp", "cpi", "rconsum", "indprod",
                           "rnresin", "rresinv", "rfedgov", "rslgov", "housing",
                           "unemp", "tbill", "tbond", "bond")) %>% 
  ggplot(aes(x = var, y = coeff, shape = type)) + 
  geom_point() +  ylim(-2, 1.5) + 
  labs(x = "Forecasted Variables", y = "Coefficent Estimate", color = "Signal", shape = "Signal")
ind_plot

ggsave(height = 4, width = 8.5, "02_OutData/Figure1A.eps")



################################################################################
### Quintile prior #############################################################
################################################################################

### forecaster-by-forecaster regressions #######################################

var_data <- rbind(cpi_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  housing_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  indprod_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  ngdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  pgdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rconsum_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rfedgov_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rgdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rnresin_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rresinv_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rslgov_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  unemp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  tbill_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  tbond_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  bond_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER)) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(time = cur_group_id()) %>% 
  ungroup() %>% 
  select(-YEAR, -QUARTER)

write_dta(var_data, "02_OutData/macro_var_data.dta")

var_list <- list("housing", "indprod", "ngdp", "pgdp", "rconsum", "rfedgov",
                 "rgdp", "rnresin", "rresinv", "rslgov", "unemp", "cpi", "tbill",
                 "tbond", "bond")

PC_coeff_list  <- list()
PIC_coeff_list <- list()

for (j in var_list){
  
  # prior-consistent
  PC_IDs <- as.list(t(var_data %>% 
                        filter((quintile_prior_lag_1 == 5 & rev_3 > 0) | (quintile_prior_lag_1 == 1 & rev_3 < 0)) %>%
                        filter(quintile_prior_lag_1 == 5 | quintile_prior_lag_1 == 1) %>%
                        filter(!is.na(rev_3),
                               !is.na(fe_3)) %>% 
                        filter(var == j) %>% 
                        group_by(ID) %>% 
                        summarize(n = n()) %>% 
                        ungroup() %>% 
                        filter(n >= 10) %>% 
                        select(-n)))
  
  PC_coeffs <- list()
  
  for (i in PC_IDs) {
    
    reg_PC_data <- var_data %>% 
      filter((quintile_prior_lag_1 == 5 & rev_3 > 0) | (quintile_prior_lag_1 == 1 & rev_3 < 0)) %>%
      filter(quintile_prior_lag_1 == 5 | quintile_prior_lag_1 == 1) %>%
      filter(!is.na(rev_3),
             !is.na(fe_3)) %>% 
      filter(ID == i) %>% 
      filter(var == j)
    
    lm_reg_PC <- lm(fe_3 ~ rev_3, data = reg_PC_data)
    
    coeff_PC <- tibble(coeff = lm_reg_PC$coefficients[2]) %>% 
      mutate(ID = i)
    
    PC_coeffs[[i]] <- coeff_PC
    
  }
  
  PC_coeff_out <- bind_rows(PC_coeffs) %>% 
    summarize(median_coeff = median(coeff)) %>% 
    mutate(var = j) %>% 
    mutate(type = "PC")
  
  PC_coeff_list[[j]] <- PC_coeff_out
  
  # prior-inconsistent
  PIC_IDs <- as.list(t(var_data %>% 
                         filter((quintile_prior_lag_1 == 5 & rev_3 < 0) | (quintile_prior_lag_1 == 1 & rev_3 > 0)) %>%
                         filter(quintile_prior_lag_1 == 5 | quintile_prior_lag_1 == 1) %>%
                         filter(!is.na(rev_3),
                                !is.na(fe_3)) %>% 
                         filter(var == j) %>% 
                         group_by(ID) %>% 
                         summarize(n = n()) %>% 
                         ungroup() %>% 
                         filter(n >= 10) %>% 
                         select(-n)))
  
  PIC_coeffs <- list()
  
  for (i in PIC_IDs) {
    
    reg_PIC_data <- var_data %>% 
      filter((quintile_prior_lag_1 == 5 & rev_3 < 0) | (quintile_prior_lag_1 == 1 & rev_3 > 0)) %>%
      filter(quintile_prior_lag_1 == 5 | quintile_prior_lag_1 == 1) %>%
      filter(!is.na(rev_3),
             !is.na(fe_3)) %>% 
      filter(ID == i) %>% 
      filter(var == j)
    
    lm_reg_PIC <- lm(fe_3 ~ rev_3, data = reg_PIC_data)
    
    coeff_PIC <- tibble(coeff = lm_reg_PIC$coefficients[2]) %>% 
      mutate(ID  = i)
    
    PIC_coeffs[[i]] <- coeff_PIC
    
  }
  
  PIC_coeff_out <- bind_rows(PIC_coeffs) %>% 
    summarize(median_coeff = median(coeff)) %>% 
    mutate(var = j) %>% 
    mutate(type = "PIC")
  
  PIC_coeff_list[[j]] <- PIC_coeff_out
  
}

PC_coeff_list_out  <- bind_rows(PC_coeff_list)
PIC_coeff_list_out <- bind_rows(PIC_coeff_list)

coeff_list_out <- bind_rows(PC_coeff_list_out, PIC_coeff_list_out)

write.csv(coeff_list_out, "02_OutData/indiv_regs_out_quintiles.csv")



### pooled regressions (interacted) ############################################

var_list <- list("housing", "indprod", "ngdp", "pgdp", "rconsum", "rfedgov",
                 "rgdp", "rnresin", "rresinv", "rslgov", "unemp", "cpi", "tbill",
                 "tbond", "bond")

coeff_list  <- list()

for (j in var_list){
  
  reg_data <- var_data %>% 
    mutate(pc = case_when(
      (quintile_prior_lag_1 == 5 & rev_3 > 0) | (quintile_prior_lag_1 == 1 & rev_3 < 0) ~ 1,
      TRUE                                                                          ~ 0),
      pc_rev_3 = pc * rev_3) %>% 
    filter(quintile_prior_lag_1 == 1 | quintile_prior_lag_1 == 5) %>%
    filter(!is.na(rev_3),
           !is.na(fe_3)) %>%
    filter(var == j)
  
  lm_reg <- as.data.frame(t(broom::tidy(felm(fe_3 ~ rev_3 + pc + pc_rev_3 | 0 | 0 | ID + time, data = reg_data)) %>% 
                              select(-statistic) %>% 
                              mutate_if(is.numeric, round, digits = 10) %>% 
                              mutate(estimate = case_when(
                                p.value < 0.01 ~ paste0(round(estimate, 3), "***"),
                                p.value < 0.05 ~ paste0(round(estimate, 3), "**"),
                                p.value < 0.1  ~ paste0(round(estimate, 3), "*"),
                                TRUE           ~ paste0(round(estimate, 3), ""))))) %>%
    janitor::row_to_names(row_number = 1) %>% 
    mutate(n = row_number()) %>% 
    filter(n == 1) %>% 
    select(-n) %>% 
    mutate(var = j)
  
  coeff_list[[j]] <- lm_reg
  
}

coeff_list_out  <- bind_rows(coeff_list)

write.csv(coeff_list_out, "02_OutData/pooled_regs_out_quintiles.csv")



################################################################################
### de-biasing the forecast ####################################################
################################################################################


var_data <- rbind(cpi_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  housing_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  indprod_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  ngdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  pgdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rconsum_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rfedgov_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rgdp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rnresin_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rresinv_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  rslgov_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  unemp_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  tbill_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  tbond_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER),
                  bond_data %>% select(ID, median_prior_lag_1, quintile_prior_lag_1, growth_prior, fe_3, rev_3, var, YEAR, QUARTER)) %>% 
  group_by(YEAR, QUARTER) %>% 
  mutate(time = cur_group_id()) %>% 
  ungroup() %>% 
  select(-YEAR, -QUARTER) %>% 
  mutate(pc = case_when(
    (quintile_prior_lag_1 == 5 & rev_3 > 0) | (quintile_prior_lag_1 == 1 & rev_3 < 0) ~ 1,
    TRUE                                                                          ~ 0),
    neg_rev = rev_3 < 0)



fe_all_data <- var_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3)) %>% 
  group_by(time, var) %>% 
  summarize(mean_abs_fe = mean(abs(fe_3))) %>% 
  ungroup() %>% 
  mutate(group = "all")

fe_pic_data <- var_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3)) %>% 
  filter(pc == 0) %>% 
  group_by(time, var) %>% 
  summarize(mean_abs_fe_pic = mean(abs(fe_3))) %>% 
  ungroup() %>% 
  mutate(group = "pic")

fe_pc_data <- var_data %>% 
  filter(!is.na(fe_3),
         !is.na(rev_3)) %>% 
  filter(pc == 1) %>% 
  group_by(time, var) %>% 
  summarize(mean_abs_fe_pc = mean(abs(fe_3))) %>% 
  ungroup() %>% 
  mutate(group = "pc")

fe_data <- fe_all_data %>% 
  select(-group) %>%
  inner_join(fe_pic_data %>% select(-group), by = c("time", "var")) %>%
  inner_join(fe_pc_data %>% select(-group), by = c("time", "var"))


fe_summary <- fe_data %>%
  group_by(var) %>%
  summarize(mean_fe     = mean(mean_abs_fe),
            mean_fe_pic = mean(mean_abs_fe_pic),
            mean_diff   = mean(mean_abs_fe_pic - mean_abs_fe),
            sd_diff     = sd(mean_abs_fe_pic - mean_abs_fe),
            n           = n()
            ) %>%
  ungroup() %>%
  mutate(diff_tval = mean_diff / (sd_diff / sqrt(n))) %>% 
  select(var, mean_fe, mean_fe_pic, mean_diff, diff_tval) %>% 
  mutate_if(is.numeric, round, digits = 4)


var_list <- list("housing", "indprod", "ngdp", "pgdp", "rconsum", "rfedgov",
                 "rgdp", "rnresin", "rresinv", "rslgov", "unemp", "cpi", "tbill",
                 "tbond", "bond")


fe_list       <- list()
fe_pic_list   <- list()
fe_diff_list  <- list()

for (i in var_list) {
  
  fe_reg_data <- fe_data %>% 
    mutate(diff = mean_abs_fe_pic - mean_abs_fe) %>% 
    filter(var == i)

  lm_mean_fe <- estimatr::lm_robust(mean_abs_fe ~ 1, clusters = time, data = fe_reg_data)
  
  out_mean_fe <- tibble(coeff = lm_mean_fe$coefficients[1],
                        pval  = lm_mean_fe$p.value[1]) %>% 
    mutate(var = i,
           group = "mean_fe")
  
  lm_mean_fe_pic <- estimatr::lm_robust(mean_abs_fe_pic ~ 1, clusters = time, data = fe_reg_data)
  
  out_mean_fe_pic <- tibble(coeff = lm_mean_fe_pic$coefficients[1],
                            pval  = lm_mean_fe_pic$p.value[1]) %>% 
    mutate(var = i,
           group = "mean_fe_pic")
  
  lm_mean_fe_diff <- estimatr::lm_robust(diff ~ 1, clusters = time, data = fe_reg_data)
  
  out_mean_fe_diff <- tibble(coeff = lm_mean_fe_diff$coefficients[1],
                             pval  = lm_mean_fe_diff$p.value[1]) %>% 
    mutate(var = i,
           group = "mean_fe_diff")
  
  
  fe_list[[i]]      <- out_mean_fe
  fe_pic_list[[i]]  <- out_mean_fe_pic
  fe_diff_list[[i]] <- out_mean_fe_diff
  
}

debias_fe      <- bind_rows(fe_list)
debias_fe_pic  <- bind_rows(fe_pic_list)
debias_fe_diff <- bind_rows(fe_diff_list)

debias_table <- debias_fe %>% 
  select(var, coeff) %>% 
  rename(abs_fe_all = coeff) %>% 
  cbind(debias_fe_pic %>% select(coeff) %>% rename(abs_fe_pic = coeff)) %>% 
  cbind(debias_fe_diff %>% select(coeff, pval) %>% rename(diff = coeff)) %>% 
  mutate_if(is.numeric, round, digits = 4)

write.csv(debias_table, "02_OutData/de_bias_table.csv")



